In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
In [2]:
sheets=pd.read_excel("Regional Sales Dataset.xlsx",sheet_name=None)
In [3]:
#Assign different dataframes to each sheets
df_sales=sheets['Sales Orders']
df_customer=sheets['Customers']
df_products=sheets['Products']
df_regions=sheets['Regions']
df_state_reg=sheets['State Regions']
df_budgets=sheets['2017 Budgets']
In [4]:
print(f"df sales shape:{df_sales.shape}")
df sales shape:(64104, 12)
In [5]:
df_state_reg.head()
Out[5]:
| Column1 | Column2 | Column3 | |
|---|---|---|---|
| 0 | State Code | State | Region |
| 1 | AL | Alabama | South |
| 2 | AR | Arkansas | South |
| 3 | AZ | Arizona | West |
| 4 | CA | California | West |
In [6]:
new_header=df_state_reg.iloc[0]
df_state_reg.columns=new_header
df_state_reg=df_state_reg[1:].reset_index(drop=True)
df_state_reg.head()
Out[6]:
| State Code | State | Region | |
|---|---|---|---|
| 0 | AL | Alabama | South |
| 1 | AR | Arkansas | South |
| 2 | AZ | Arizona | West |
| 3 | CA | California | West |
| 4 | CO | Colorado | West |
In [7]:
df_budgets.columns
Out[7]:
Index(['Product Name', '2017 Budgets'], dtype='object')
Data Cleaning and Wrangling¶
In [9]:
#merging sales and customer tables by using foreign key customer index
df=df_sales.merge(
df_customer,
how='left',
left_on='Customer Name Index',
right_on='Customer Index'
)
In [10]:
df.head(1)
Out[10]:
| OrderNumber | OrderDate | Customer Name Index | Channel | Currency Code | Warehouse Code | Delivery Region Index | Product Description Index | Order Quantity | Unit Price | Line Total | Total Unit Cost | Customer Index | Customer Names | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SO - 000225 | 2014-01-01 | 126 | Wholesale | USD | AXW291 | 364 | 27 | 6 | 2499.1 | 14994.6 | 1824.343 | 126 | Rhynoodle Ltd |
In [11]:
#merging with products table
df=df.merge(
df_products,
how='left',
left_on='Product Description Index',
right_on='Index'
)
In [12]:
df.head(1)
Out[12]:
| OrderNumber | OrderDate | Customer Name Index | Channel | Currency Code | Warehouse Code | Delivery Region Index | Product Description Index | Order Quantity | Unit Price | Line Total | Total Unit Cost | Customer Index | Customer Names | Index | Product Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SO - 000225 | 2014-01-01 | 126 | Wholesale | USD | AXW291 | 364 | 27 | 6 | 2499.1 | 14994.6 | 1824.343 | 126 | Rhynoodle Ltd | 27 | Product 27 |
In [13]:
#merge with Regions
df=df.merge(
df_regions,
how='left',
left_on='Delivery Region Index',
right_on='id'
)
In [14]:
df.head(1)
Out[14]:
| OrderNumber | OrderDate | Customer Name Index | Channel | Currency Code | Warehouse Code | Delivery Region Index | Product Description Index | Order Quantity | Unit Price | ... | type | latitude | longitude | area_code | population | households | median_income | land_area | water_area | time_zone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SO - 000225 | 2014-01-01 | 126 | Wholesale | USD | AXW291 | 364 | 27 | 6 | 2499.1 | ... | City | 32.08354 | -81.09983 | 912 | 145674 | 52798 | 36466 | 268318796 | 13908113 | America/New York |
1 rows × 31 columns
In [15]:
#merge with state_region
df=df.merge(
df_state_reg[["State Code","Region"]],
how='left',
left_on='state_code',
right_on='State Code'
)
In [16]:
df=df.merge(
df_budgets,
how='left',
on='Product Name'
)
In [17]:
df.head(1)
Out[17]:
| OrderNumber | OrderDate | Customer Name Index | Channel | Currency Code | Warehouse Code | Delivery Region Index | Product Description Index | Order Quantity | Unit Price | ... | area_code | population | households | median_income | land_area | water_area | time_zone | State Code | Region | 2017 Budgets | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SO - 000225 | 2014-01-01 | 126 | Wholesale | USD | AXW291 | 364 | 27 | 6 | 2499.1 | ... | 912 | 145674 | 52798 | 36466 | 268318796 | 13908113 | America/New York | GA | South | 964940.231 |
1 rows × 34 columns
In [18]:
#repetative column cleanup
#customer_index
#index
#id
#State code
cols_to_drop=['Customer Index','Index','id','State Code']
df=df.drop(columns=cols_to_drop)
df.head(1)
Out[18]:
| OrderNumber | OrderDate | Customer Name Index | Channel | Currency Code | Warehouse Code | Delivery Region Index | Product Description Index | Order Quantity | Unit Price | ... | longitude | area_code | population | households | median_income | land_area | water_area | time_zone | Region | 2017 Budgets | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SO - 000225 | 2014-01-01 | 126 | Wholesale | USD | AXW291 | 364 | 27 | 6 | 2499.1 | ... | -81.09983 | 912 | 145674 | 52798 | 36466 | 268318796 | 13908113 | America/New York | South | 964940.231 |
1 rows × 30 columns
In [19]:
#convert all column names for easy accessa
df.columns=df.columns.str.lower()
df.columns.values
Out[19]:
array(['ordernumber', 'orderdate', 'customer name index', 'channel',
'currency code', 'warehouse code', 'delivery region index',
'product description index', 'order quantity', 'unit price',
'line total', 'total unit cost', 'customer names', 'product name',
'name', 'county', 'state_code', 'state', 'type', 'latitude',
'longitude', 'area_code', 'population', 'households',
'median_income', 'land_area', 'water_area', 'time_zone', 'region',
'2017 budgets'], dtype=object)
In [20]:
#now to keep the imp column and delete those columns which we think is not nessacry
cols_to_keep=[
'ordernumber',
'orderdate',
'customer names',
'channel',
'product name',
'order quantity',
'unit price',
'line total',
'total unit cost',
'state_code',
'county',
'state',
'region',
'latitude',
'longitude',
'2017 budgets'
]
In [21]:
df=df[cols_to_keep]
In [22]:
df.head()
Out[22]:
| ordernumber | orderdate | customer names | channel | product name | order quantity | unit price | line total | total unit cost | state_code | county | state | region | latitude | longitude | 2017 budgets | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SO - 000225 | 2014-01-01 | Rhynoodle Ltd | Wholesale | Product 27 | 6 | 2499.1 | 14994.6 | 1824.343 | GA | Chatham County | Georgia | South | 32.08354 | -81.09983 | 964940.231 |
| 1 | SO - 0003378 | 2014-01-01 | Thoughtmix Ltd | Distributor | Product 20 | 11 | 2351.7 | 25868.7 | 1269.918 | IN | Johnson County | Indiana | Midwest | 39.61366 | -86.10665 | 2067108.120 |
| 2 | SO - 0005126 | 2014-01-01 | Amerisourc Corp | Wholesale | Product 26 | 6 | 978.2 | 5869.2 | 684.740 | CA | Alameda County | California | West | 37.66243 | -121.87468 | 5685138.270 |
| 3 | SO - 0005614 | 2014-01-01 | Colgate-Pa Group | Export | Product 7 | 7 | 2338.3 | 16368.1 | 1028.852 | IN | Monroe County | Indiana | Midwest | 39.16533 | -86.52639 | 889737.555 |
| 4 | SO - 0005781 | 2014-01-01 | Deseret Group | Wholesale | Product 8 | 8 | 2291.4 | 18331.2 | 1260.270 | CT | Hartford County | Connecticut | Northeast | 41.77524 | -72.52443 | 1085037.329 |
In [23]:
#rename the columns
df=df.rename(columns={
'ordernumber':'order_number',
'orderdate':'order_date',
'customernames':'customer_name',
'unit price':'unit_price',
'line total':'revenue',
'total_unit_cost':'cost',
'latitude':'lat',
'longitude':'lon',
'2017 budgets':'budget'
})
In [24]:
#blank out budgets for non 2017 orders bcz of 2017 budgets table
df.loc[df['order_date'].dt.year != 2017,'budget']=pd.NA
df[['order_date','budget']].head(3)
Out[24]:
| order_date | budget | |
|---|---|---|
| 0 | 2014-01-01 | NaN |
| 1 | 2014-01-01 | NaN |
| 2 | 2014-01-01 | NaN |
In [25]:
## Now filter the dataframe to contain only the records of the year 2017
df_2017=df[df['order_date'].dt.year == 2017]
df_2017.isnull().sum()
Out[25]:
order_number 0 order_date 0 customer names 0 channel 0 product name 0 order quantity 0 unit_price 0 revenue 0 total unit cost 0 state_code 0 county 0 state 0 region 0 lat 0 lon 0 budget 0 dtype: int64
In [26]:
df['total_cost']=df['order quantity']*df['total unit cost']
df['profit']=df['revenue']-df['total_cost']
df['profit margin']=df['profit']/df['revenue']*100
df.head(3)
# 4. Extract full month name from order_date for labeling (e.g., 'January', 'February')
df['order_month_name'] = df['order_date'].dt.month_name()
# 5. Extract month number from order_date for correct sorting (1–12)
df['order_month_num'] = df['order_date'].dt.month
# 6. (Optional) Display the updated DataFrame
df
Out[26]:
| order_number | order_date | customer names | channel | product name | order quantity | unit_price | revenue | total unit cost | state_code | ... | state | region | lat | lon | budget | total_cost | profit | profit margin | order_month_name | order_month_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SO - 000225 | 2014-01-01 | Rhynoodle Ltd | Wholesale | Product 27 | 6 | 2499.1 | 14994.6 | 1824.343 | GA | ... | Georgia | South | 32.08354 | -81.09983 | NaN | 10946.058 | 4048.542 | 27.0 | January | 1 |
| 1 | SO - 0003378 | 2014-01-01 | Thoughtmix Ltd | Distributor | Product 20 | 11 | 2351.7 | 25868.7 | 1269.918 | IN | ... | Indiana | Midwest | 39.61366 | -86.10665 | NaN | 13969.098 | 11899.602 | 46.0 | January | 1 |
| 2 | SO - 0005126 | 2014-01-01 | Amerisourc Corp | Wholesale | Product 26 | 6 | 978.2 | 5869.2 | 684.740 | CA | ... | California | West | 37.66243 | -121.87468 | NaN | 4108.440 | 1760.760 | 30.0 | January | 1 |
| 3 | SO - 0005614 | 2014-01-01 | Colgate-Pa Group | Export | Product 7 | 7 | 2338.3 | 16368.1 | 1028.852 | IN | ... | Indiana | Midwest | 39.16533 | -86.52639 | NaN | 7201.964 | 9166.136 | 56.0 | January | 1 |
| 4 | SO - 0005781 | 2014-01-01 | Deseret Group | Wholesale | Product 8 | 8 | 2291.4 | 18331.2 | 1260.270 | CT | ... | Connecticut | Northeast | 41.77524 | -72.52443 | NaN | 10082.160 | 8249.040 | 45.0 | January | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 64099 | SO - 0007573 | 2018-02-28 | Dazzlesphe Corp | Wholesale | Product 26 | 12 | 1815.7 | 21788.4 | 980.478 | PA | ... | Pennsylvania | Northeast | 40.15511 | -74.82877 | NaN | 11765.736 | 10022.664 | 46.0 | February | 2 |
| 64100 | SO - 0007706 | 2018-02-28 | Yombu Corp | Export | Product 21 | 6 | 864.3 | 5185.8 | 579.081 | IL | ... | Illinois | Midwest | 42.11030 | -88.03424 | NaN | 3474.486 | 1711.314 | 33.0 | February | 2 |
| 64101 | SO - 0007718 | 2018-02-28 | Bath Group | Distributor | Product 13 | 11 | 3953.0 | 43483.0 | 2648.510 | FL | ... | Florida | South | 26.24453 | -80.20644 | NaN | 29133.610 | 14349.390 | 33.0 | February | 2 |
| 64102 | SO - 0008084 | 2018-02-28 | Linklinks Ltd | Distributor | Product 20 | 7 | 3959.7 | 27717.9 | 2930.178 | NY | ... | New York | Northeast | 42.91002 | -78.74182 | NaN | 20511.246 | 7206.654 | 26.0 | February | 2 |
| 64103 | SO - 0008654 | 2018-02-28 | SAFEWAY Ltd | Distributor | Product 15 | 8 | 998.3 | 7986.4 | 848.555 | OR | ... | Oregon | West | 45.48706 | -122.80371 | NaN | 6788.440 | 1197.960 | 15.0 | February | 2 |
64104 rows × 21 columns
In [27]:
df['order_month'] = df['order_date'].dt.to_period('M')
monthly_sales = df.groupby('order_month')['revenue'].sum()
plt.figure(figsize=(15, 4))
monthly_sales.plot(marker='o', color='navy')
from matplotlib.ticker import FuncFormatter
formatter = FuncFormatter(lambda x, pos: f'{x/1e6:.1f}M')
plt.gca().yaxis.set_major_formatter(formatter)
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Total Revenue (Millions)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [28]:
df_ = df[df['order_date'].dt.year != 2018] # exclude any partial-year data (i.e., January and February of 2018) so that the monthly totals aren’t skewed by an incomplete year.
#I create df_ so I don’t alter the original df, preserving it intact for other analyses while filtering just for this chart.
# 2. Group by month number and month name, sum revenue, then sort by month number
monthly_sales = (
df_
.groupby(['order_month_num', 'order_month_name'])['revenue']
.sum()
.sort_index()
)
# 3. Plot setup
from matplotlib.ticker import FuncFormatter
plt.figure(figsize=(13, 4))
plt.plot(
monthly_sales.index.get_level_values(1), # X-axis: month names
monthly_sales.values, # Y-axis: total revenue
marker='o', # circle markers
color='navy' # line color
)
# 4. Scale y-axis values to millions for readability
formatter = FuncFormatter(lambda x, pos: f'{x/1e6:.1f}M')
plt.gca().yaxis.set_major_formatter(formatter)
# 5. Add title and axis labels
plt.title('Overall Monthly Sales Trend (Excluding 2018)')
plt.xlabel('Month')
plt.ylabel('Total Revenue (Millions)')
# 6. Rotate x-axis labels for readability
plt.xticks(rotation=45)
# 7. Adjust layout to prevent clipping
plt.tight_layout()
# 8. Display the plot
plt.show()
In [29]:
top_margin = (
df.groupby('product name')['profit']
.mean() # Calculate mean profit for each product
.sort_values(ascending=False) # Sort from highest to lowest average profit
.head(10) # Keep only the top 10 products
)
plt.figure(figsize=(9, 4))
sns.barplot(
x=top_margin.values, # X-axis: average profit values
y=top_margin.index, # Y-axis: product names
palette='viridis' # Color palette for the bars
)
plt.title('Top 10 Products by Avg Profit Margin') # Chart title
plt.xlabel('Average Profit Margin (USD)') # X-axis label
plt.ylabel('Product Name') # Y-axis label
plt.tight_layout()
plt.show()
C:\Users\prajw\AppData\Local\Temp\ipykernel_2176\601553980.py:10: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
In [30]:
# Group revenue by sales channel and sort descending
chan_sales = df.groupby('channel')['revenue'].sum().sort_values(ascending=False)
# Set figure size for the pie chart
plt.figure(figsize=(5, 5))
# Plot pie chart with percentage labels and a defined start angle
plt.pie(
chan_sales.values, # Data: revenue values per channel
labels=chan_sales.index, # Labels: channel names
autopct='%1.1f%%', # Display percentages with one decimal
startangle=140, # Rotate chart so first slice starts at 140 degrees
colors=sns.color_palette('coolwarm') # Color palette for slices
)
# Add title for context
plt.title('Total Sales by Channel')
# Adjust layout to ensure everything fits well
plt.tight_layout()
# Display the chart
plt.show()
In [31]:
# Calculate the total revenue for each order to get the order value
aov = df.groupby('order_number')['revenue'].sum()
# Set the figure size for better visibility
plt.figure(figsize=(12, 4))
# Plot a histogram of order values
plt.hist(
aov, # Data: list of order values
bins=50, # Number of bins to group order values
color='skyblue', # Fill color of the bars
edgecolor='black' # Outline color of the bars
)
# Add title and axis labels for context
plt.title('Distribution of Average Order Value')
plt.xlabel('Order Value (USD)')
plt.ylabel('Number of Orders')
# Adjust layout to prevent clipping
plt.tight_layout()
# Show the plot
plt.show()
In [32]:
# Calculate profit margin percentage for each record
df['profit_margin_pct'] = (df['profit'] / df['revenue']) * 100
# Set the figure size for clarity
plt.figure(figsize=(6,4))
# Plot unit price vs. profit margin percentage
plt.scatter(
df['unit_price'], # X-axis: unit price in USD
df['profit_margin_pct'], # Y-axis: profit margin percentage
alpha=0.6, # Transparency level for overlapping points
color='green' # Point color
)
# Add title and axis labels
plt.title('Profit Margin % vs. Unit Price') # Chart title
plt.xlabel('Unit Price (USD)') # X-axis label
plt.ylabel('Profit Margin (%)') # Y-axis label
# Adjust layout to prevent clipping
plt.tight_layout()
# Display the plot
plt.show()
In [33]:
# Set figure size for clarity
plt.figure(figsize=(12,4))
# Create a boxplot of unit_price by product_name
sns.boxplot(
data=df,
x='product name', # X-axis: product categories
y='unit_price', # Y-axis: unit price values
color='g' # Box color
)
# Add title and axis labels
plt.title('Unit Price Distribution per Product') # Chart title
plt.xlabel('Product') # X-axis label
plt.ylabel('Unit Price (USD)') # Y-axis label
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Adjust layout to prevent clipping of labels
plt.tight_layout()
# Display the plot
plt.show()
In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Aggregate total sales by region (in millions)
region_sales = (
df
.groupby('region')['revenue']
.sum()
.sort_values(ascending=False) # so bars go top→bottom
/ 1e6 # convert to millions
)
# 2. Plot
plt.figure(figsize=(10, 4))
sns.barplot(
x=region_sales.values,
y=region_sales.index,
palette='Greens_r' # dark→light green
)
# 3. Formatting
plt.title('Total Sales by US Region', fontsize=16, pad=12)
plt.xlabel('Total Sales (in Millions USD)', fontsize=12)
plt.ylabel('US Region', fontsize=12)
plt.xticks(rotation=0)
sns.despine(left=True, bottom=True)
plt.tight_layout()
plt.show()
C:\Users\prajw\AppData\Local\Temp\ipykernel_2176\1217761970.py:15: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
In [35]:
import plotly.express as px
# 1. Aggregate revenue by state (in millions)
state_sales = (
df
.groupby('state')['revenue']
.sum()
.reset_index()
)
state_sales['revenue_m'] = state_sales['revenue'] / 1e6 # convert to millions
# 2. Plotly choropleth
fig = px.choropleth(
state_sales,
locations='state', # column with state codes
locationmode='USA-states', # tells Plotly these are US states
color='revenue_m',
scope='usa',
labels={'revenue_m':'Total Sales (M USD)'},
color_continuous_scale='Blues',
hover_data={'revenue_m':':.2f'} # show 2 decimals
)
# 3. Layout tuning
fig.update_layout(
title_text='Total Sales by State',
margin=dict(l=0, r=0, t=40, b=0),
coloraxis_colorbar=dict(
title='Sales (M USD)',
ticksuffix='M'
)
)
fig.show()
In [ ]:
df.to_csv('Sales_data(EDA Exported).csv', index=False)
In [ ]:
df= df[df['order_date'].dt.year != 2018]
In [ ]:
df.to_csv('Sales_data(EDA Exported).csv', index=False)
In [ ]: